This notebook performs a basic overview of the data and analyses the
genetic diversity indicators. It uses as input the “clean kobo output”
that was first cleaned by 1.2_cleaning.
Load required libraries:
library(tidyr)
library(dplyr)
library(utile.tools)
library(stringr)
library(ggplot2)
library(ggsankey)
library(alluvial)
library(viridis)
library(cowplot)
Load required functions. These custom fuctions are available at: https://github.com/AliciaMstt/GeneticIndicators
source("get_indicator1_data.R")
source("get_indicator2_data.R")
source("get_indicator3_data.R")
source("get_metadata.R")
source("transform_to_Ne.R")
source("estimate_indicator1.R")
Other custom functions:
# to imitate ggplot colors, thanks to https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
# not in
'%!in%' <- function(x,y)!('%in%'(x,y))
Custom colors:
## IUCN official colors
IUCNcolors<-c("brown2", "darkgrey", "darkorange", "darkgreen", "bisque1", "green", "azure2", "yellow")
## nice soft ramp for taxonomic groups
taxoncolors<-cividis(12) # same than using cividis(length(levels(as.factor(metadata$taxonomic_group))))
## Colors for simplified methods to define populations
# assuming the following (see below, it wont work here): levels(as.factor(ind2_data$defined_populations_simplified))
# [1] "adaptive_traits management_units" "eco_biogeo_proxies" "genetic_clusters"
# [4] "genetic_clusters eco_biogeo_proxies" "genetic_clusters geographic_boundaries" "geographic_boundaries"
# [7] "geographic_boundaries adaptive_traits" "geographic_boundaries eco_biogeo_proxies" "geographic_boundaries management_units"
# [10] "low_freq_combinations" "management_units" "other"
# get a set of colors to highlight genetic and geographic with similar colors
simplifiedmethods_colors<-c("#b34656", #"adaptive_traits management_units"
"#7f611b", # "eco_biogeo_proxies"
"#668cd1", # "genetic_clusters"
"#668cd1", # "genetic_clusters eco_biogeo_proxies"
"#45c097", # "genetic_clusters geographic_boundaries"
"#d4b43e", # "geographic_boundaries"
"#d4b43e", # "geographic_boundaries adaptive_traits"
"#d4b43e", # "geographic_boundaries eco_biogeo_proxies"
"#d4b43e", # "geographic_boundaries management_units"
"#be72c9", # "low_freq_combinations"
"#b93921", # "management_units"
"#d868a2")# "other"
Get indicators data from clean kobo output
# Get data:
kobo_clean<-read.csv(file="kobo_output_clean.csv", header=TRUE)
# Extract indicator 1 data from kobo output, show most relevant columns
ind1_data<-get_indicator1_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind1_data[,c(1:3, 12:14)])
# Extract indicator 2 data from kobo output, show most relevant columns
ind2_data<-get_indicator2_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind2_data[,c(1:3, 9:10,13)])
# Extract indicator 3 data from kobo output, show most relevant columns
ind3_data<-get_indicator3_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind3_data[,c(1:3, 9:11)])
# extract metadata, show most relevant columns
metadata<-get_metadata(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(metadata[,c(1:3, 12, 25,26, 64)])
# save processed data
write.csv(ind1_data, "ind1_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(ind2_data, "ind2_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(ind3_data, "ind3_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(metadata, "metadata.csv", row.names = FALSE, fileEncoding = "UTF-8")
The methods used to define populations come from a check box question were one or more of the following categories can be selected: genetic_clusters, geographic_boundaries, eco_biogeo_proxies, adaptive_traits, management_units, other. As a consequence any combination of the former can be possible. Leading to the following results:
table(ind2_data$defined_populations)
##
## adaptive_traits
## 5
## adaptive_traits management_units
## 20
## eco_biogeo_proxies
## 41
## eco_biogeo_proxies adaptive_traits
## 2
## eco_biogeo_proxies management_units
## 10
## eco_biogeo_proxies other
## 2
## genetic_clusters
## 107
## genetic_clusters adaptive_traits
## 7
## genetic_clusters eco_biogeo_proxies
## 20
## genetic_clusters eco_biogeo_proxies adaptive_traits
## 3
## genetic_clusters eco_biogeo_proxies adaptive_traits management_units
## 2
## genetic_clusters eco_biogeo_proxies management_units
## 1
## genetic_clusters geographic_boundaries
## 74
## genetic_clusters geographic_boundaries adaptive_traits
## 5
## genetic_clusters geographic_boundaries eco_biogeo_proxies
## 7
## genetic_clusters geographic_boundaries eco_biogeo_proxies adaptive_traits
## 1
## genetic_clusters geographic_boundaries eco_biogeo_proxies adaptive_traits management_units
## 1
## genetic_clusters geographic_boundaries eco_biogeo_proxies management_units
## 1
## genetic_clusters geographic_boundaries management_units
## 8
## genetic_clusters management_units
## 11
## genetic_clusters other
## 2
## geographic_boundaries
## 276
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries adaptive_traits management_units
## 12
## geographic_boundaries adaptive_traits management_units other
## 1
## geographic_boundaries eco_biogeo_proxies
## 75
## geographic_boundaries eco_biogeo_proxies adaptive_traits
## 3
## geographic_boundaries eco_biogeo_proxies management_units
## 3
## geographic_boundaries eco_biogeo_proxies other
## 2
## geographic_boundaries management_units
## 25
## geographic_boundaries other
## 12
## management_units
## 132
## management_units other
## 2
## other
## 19
It is hard to group the above methods, so we will keep the original groups with n >=19 in the above list, and tag the combinations that appear few times as as “low_freq_combinations”.
Which groups have n>=19?
x<-as.data.frame(table(ind2_data$defined_populations)[table(ind2_data$defined_populations) >= 19])
colnames(x)[1]<-"method"
x
## (CONSIDER ADDING THIS CHUNK TO THE GET_ FUNCTIONS?)
### for ind2_data
ind2_data<- ind2_data %>%
mutate(defined_populations_simplified = case_when(
# if the method is in the list of methods n>=19 then keep it
defined_populations %in% x$method ~ defined_populations,
TRUE ~ "low_freq_combinations"))
### for meta
metadata<- metadata %>%
mutate(defined_populations_simplified = case_when(
# if the method is in the list of methods n>=19 then keep it
defined_populations %in% x$method ~ defined_populations,
TRUE ~ "low_freq_combinations"))
Check n for simplified methods:
table(ind2_data$defined_populations_simplified)
##
## adaptive_traits management_units
## 20
## eco_biogeo_proxies
## 41
## genetic_clusters
## 107
## genetic_clusters eco_biogeo_proxies
## 20
## genetic_clusters geographic_boundaries
## 74
## geographic_boundaries
## 276
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 75
## geographic_boundaries management_units
## 25
## low_freq_combinations
## 103
## management_units
## 132
## other
## 19
Another option is to highlight if genetic_cluster or geographic_boundaries were used at all, which are the main drivers. This will look like:
### for ind2_data
#ind2_data<- ind2_data %>%
# mutate(defined_populations_geneticgeo = case_when(
#### PENDIENTE USE grepl to show TRUE for geo, gen, geo AND gen, others.
#))
Table of equivalences:
ind2_data %>%
select(defined_populations, defined_populations_simplified) %>%
filter(!duplicated(defined_populations))
Records by country, including taxa assessed more than once (see below for details on this)
ggplot(metadata, aes(x=country_assessment)) +
geom_bar(stat = "count") +
ggtitle("Number of taxa assessedd by country, \nincluding duplicated taxa")
Did countries used kobo or tabular?
ggplot(metadata, aes(x=country_assessment, fill=kobo_tabular)) +
geom_bar(stat = "count") +
ggtitle("Number of taxa assessedd by country, \nincluding duplicated taxa")
Records by taxonomic groups
ggplot(metadata, aes(x=taxonomic_group)) +
geom_bar(stat = "count") +
theme(axis.text.x = element_text(angle = 45)) +
ggtitle("Number of taxa assessedd by taxonomic group, \nincluding duplicated taxa")
Some taxa were assessed twice, for example to account for uncertainty
on how to divide populations. This information is stored in variable
multiassessment of the metadata (created by
get_metadata()). An example of taxa with multiple
assessments:
metadata %>%
filter(multiassessment=="multiassessment") %>%
select(taxonomic_group, taxon, country_assessment, multiassessment) %>%
arrange(taxon, country_assessment) %>%
head()
In total these are the number or records (assessment) done for both categories:
table(metadata$multiassessment)
##
## multiassessment single_assessment
## 93 831
The above numbers refer to the number or records, if what we want is to know how many taxa were analysed for each category, then:
Number of taxa with multiple submissions:
multi_taxa<- metadata %>%
filter(multiassessment=="multiassessment") %>%
select(taxon, country_assessment, taxonomic_group) %>%
unique()
# how many?
nrow(multi_taxa)
## [1] 46
Number of taxa with single submissions:
single_taxa<- metadata %>%
filter(multiassessment=="single_assessment") %>%
select(taxon, country_assessment, taxonomic_group) %>%
unique()
# how many?
nrow(single_taxa)
## [1] 831
To explore what kind of taxa countries assessed regardless of if they assessed them once or more, lets create a dataset keeping all single assessed taxa, plus only the first assessment for taxa assessed multiple times.
# object with single assessed taxa, plus only the first assessment for taxa assessed multiple times
metadata_firstmulti<-metadata[!duplicated(cbind(metadata$taxon, metadata$country_assessment)), ]
How many records?
# how many?
nrow(metadata_firstmulti)
## [1] 877
Of which countries and taxonomic groups are the taxa that were assessed more than once?
metadata_firstmulti %>% # we use the _firstmulti dataset so that multiassesed records are counted only once
filter(multiassessment=="multiassessment") %>%
ggplot(aes(x=country_assessment)) +
geom_bar(stat = "count") +
ggtitle("Number of taxa assessed more than once, by country")
metadata_firstmulti %>% # we use the _unique dataset so that multiassesed records are counted only once
filter(multiassessment=="multiassessment") %>%
ggplot(aes(x=taxonomic_group, fill=country_assessment)) +
geom_bar(stat = "count") +
theme(axis.text.x = element_text(angle = 45)) +
ggtitle("Number of taxa assessed more than once")
Now check taxa assessed excluding duplicates, i.e. the real number of taxa assessed. This will be used in downstream analyses
ggplot(metadata_firstmulti, aes(x=country_assessment)) +
geom_bar(stat = "count") +
ggtitle("Number of taxa assessed by country")
ggplot(metadata_firstmulti, aes(x=taxonomic_group)) +
geom_bar(stat = "count") +
theme(axis.text.x = element_text(angle = 45)) +
ggtitle("Number of taxa assessed by taxonomic group")
Note: The following plots in this section consider only one record of the taxa that were assessed more than once. That is a total of 877 taxa.
Note on alluvial vs Sankey, taken from ggalluvial: An important feature of alluvial plots is the meaningfulness of the vertical axis: No gaps are inserted between the strata, so the total height of the plot reflects the cumulative quantity of the observations. The plots produced by {ggalluvial} conform to the “grammar of graphics” principles of {ggplot2}, and this prevents users from producing “free-floating” visualizations like the Sankey diagrams
Using alluvial:
library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(country_assessment, taxonomic_group) %>%
summarise(n=n())
head(foralluvial_1)
## plot
# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_1$country_assessment)))
# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_1$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)
head(countries)
## [1] "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D"
# plot
alluvial(foralluvial_1[,1:2], freq = foralluvial_1$n,
col=countries,
blocks=FALSE,
gap.width = 0.3,
cex=.8,
xw = 0.2,
cw = 0.1,
border = NA)
Using ggsankey
library(ggsankey)
# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
make_long(country_assessment, taxonomic_group)
# Sankey plot
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.6,
node.color = NA,
show.legend = FALSE) +
geom_sankey_label(size = 2, color = 1, fill = "white") +
# colour by country
scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata$country_assessment))), # a nice color for each of the n countries
rep("darkgrey", length(unique(metadata$taxonomic_group)))), # grey for the n taxonomic groups
breaks=c(unique(metadata$country_assessment),
unique(metadata$taxonomic_group))) +
theme_sankey(base_size = 10) +
xlab("")
Using alluvial:
library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(country_assessment, taxonomic_group, popsize_data) %>%
summarise(n=n())
head(foralluvial_1)
## plot
# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_1$country_assessment)))
# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_1$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)
head(countries)
## [1] "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D"
# plot
alluvial(foralluvial_1[,1:3], freq = foralluvial_1$n,
col=countries,
blocks=FALSE,
gap.width = 0.5,
cex=.8,
xw = 0.1,
cw = 0.2,
border = NA)
Using ggsankey option 1
# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
make_long(country_assessment, taxonomic_group, popsize_data)
# Sankey
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.6,
show.legend = FALSE) +
# manually set flow fill according to countries and popsize
# a nice color for each of the n countries for the first (left) part
scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata_firstmulti$country_assessment))),
# grey for the taxonomic groups
rep("darkgrey", length(unique(metadata_firstmulti$taxonomic_group))),
# three colors for popsize_data
c("darkolivegreen", "darkgoldenrod1", "brown3")),
breaks=c(unique(metadata_firstmulti$country_assessment),
unique(metadata_firstmulti$taxonomic_group),
unique(metadata_firstmulti$popsize_data))) +
geom_sankey_label(size = 2, color = 1, fill = "white") +
theme_sankey(base_size = 10) +
xlab("")
Using ggsankey option 2
# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
make_long(country_assessment, taxonomic_group, popsize_data)
# Sankey
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.6,
show.legend = FALSE) +
# manually set flow fill according to countries and popsize
# a nice color for each of the n countries for the first (left) part
scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata_firstmulti$country_assessment))),
# gradient of soft colors for taxonomic groups
taxoncolors,
# traffic light for pop data
c("darkolivegreen", "darkgoldenrod1", "brown3")),
breaks=c(unique(metadata_firstmulti$country_assessment),
levels(as.factor(metadata_firstmulti$taxonomic_group)),
unique(metadata_firstmulti$popsize_data))) +
geom_sankey_label(size = 2.5, color = "black", fill = "white") +
theme_sankey(base_size = 10) +
xlab("")
Bar plots
ggplot(metadata, aes(x=taxonomic_group, fill=popsize_data)) +
geom_bar(stat = "count") +
coord_flip() +
facet_wrap(~country_assessment, ncol = 4) +
scale_fill_manual(values=c("#1f77b4", "grey80", "#2ca02c"),
breaks=c(levels(as.factor(metadata$popsize_data)))) +
theme_light()
Using alluvial: (pending to fix a NA in Cucurbita argyrosperma)
library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(taxonomic_group, global_IUCN, popsize_data) %>%
summarise(n=n())
## plot
# define colors according to IUCN categories
# check order of categories:
levels(as.factor(metadata_firstmulti$global_IUCN))
# create vector of colors accordingly:
my_cols<-IUCNcolors
# we need a vector of colors by iucn for each row of the dataset, so:
gIUCN<-as.factor(foralluvial_1$global_IUCN)
levels(gIUCN)<-my_cols
gIUCN<-as.vector(gIUCN)
head(gIUCN)
# plot
alluvial(foralluvial_1[,1:3], freq = foralluvial_1$n,
col=gIUCN,
blocks="bookends",
alpha = 0.6,
gap.width = 0.4,
cex=.75,
xw = 0.15,
cw = 0.2,
border = NA)
Using ggsankey
# transform data to how ggsankey wants it
df <- metadata %>%
make_long(taxonomic_group, global_IUCN, popsize_data)
# Sankey
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.6,
node.color = NA,
show.legend = FALSE) +
# manually set flow fill
# gradient of soft colors for taxonomic groups
scale_fill_manual(values= c(taxoncolors,
# iucn color codes
IUCNcolors,
# traffic light for pop data
c("darkolivegreen", "darkgoldenrod1", "brown3")),
breaks=c(levels(as.factor(metadata_firstmulti$taxonomic_group)),
levels(as.factor(metadata_firstmulti$global_IUCN)),
unique(metadata_firstmulti$popsize_data))) +
geom_sankey_label(size = 2, color = 1, fill = "white") +
theme_sankey(base_size = 10) +
xlab("")
The following plots consider the whole dataset, ie including taxa that were assessed more than once (because they could have been analysed using different methods to define populations)
# reformat data
foralluvial<-metadata %>% group_by(country_assessment, defined_populations_simplified, taxonomic_group) %>%
summarise(n=n())
## plot
# define colors
my_cols<- simplifiedmethods_colors
# we need a vector of colors by country for each row of the dataset, so:
methodspop<-as.factor(foralluvial$defined_populations_simplified)
levels(methodspop)<-my_cols
methodspop<-as.vector(methodspop)
head(methodspop)
## [1] "#668cd1" "#668cd1" "#668cd1" "#668cd1" "#668cd1" "#45c097"
# plot
alluvial(foralluvial[,1:3], freq = foralluvial$n,
col=methodspop,
blocks=FALSE,
gap.width = 0.5,
cex=.8,
xw = 0.1,
cw = 0.2,
border = NA,
alpha = .7)
Same only country and method:
# plot
alluvial(foralluvial[,1:2], freq = foralluvial$n,
col=methodspop,
blocks=FALSE,
gap.width = 0.5,
cex=.8,
xw = 0.1,
cw = 0.2,
border = NA,
alpha = .7)
Sankey just becasue why not:
# transform data to how ggsankey wants it
df <- metadata %>%
make_long(country_assessment, defined_populations_simplified, taxonomic_group)
# Sankey plot
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.6,
node.color = NA,
show.legend = FALSE) +
geom_sankey_label(size = 2, color = 1, fill = "white") +
# colour by country
# a nice color for each of the n countries
scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata$country_assessment))),
rep("darkgrey", length(unique(metadata$defined_populations_simplified))),
taxoncolors),
breaks=c(unique(metadata$country_assessment),
levels(as.factor(ind2_data$defined_populations_simplified)),
levels(as.factor(ind2_data$taxonomic_group)))) +
theme_sankey(base_size = 10) +
xlab("")
Population size data may come from different methods for each population within a single taxon. For example, some populations can have Ne estimates, other Nc and others a range. Examples:
ind1_data %>%
filter(taxon=="Alces alces") %>%
select(country_assessment, taxon, population, Name, Ne, NcRange)
ind1_data %>%
filter(taxon=="Alces alces") %>%
select(country_assessment, taxon, population, Name, NcPoint, NcRange)
Also, for some taxa there may be population size data for some populations, but not all. Therefore indicator 1 would be computed with less populations than the total number of populations. Example (see pop3, 4, 13, 15):
ind1_data %>%
filter(taxon=="Juniperus monticola") %>%
select(country_assessment, taxon, population, Name, Ne, NcPoint, NcRange)
We need to keep the former in mind for interpretation and discussion of how the indicator can change in future assessments if data becomes available for populations currently missing.
How many of the 2549 populations have Ne, Nc or range data?
Ne?
sum(!is.na(ind1_data$Ne))
## [1] 174
Nc point?
sum(!is.na(ind1_data$NcPoint))
## [1] 636
Nc range?
sum(!is.na(ind1_data$NcRange))
## [1] 1460
Ne data by population
ind1_data %>%
ggplot(aes(x=country_assessment, fill=is.na(Ne)))+
geom_bar(position = "dodge") +
coord_flip() +
scale_fill_manual(labels=c("no", "yes"),
breaks=c("TRUE", "FALSE"),
values=c("brown", "darkgreen")) +
labs(fill="Population has Ne data?") +
xlab("") +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank())
Nc data type by population
ind1_data %>%
filter(!is.na(NcType)) %>%
ggplot(aes(x=country_assessment, fill=NcType))+
geom_bar(position = "dodge") +
coord_flip() +
scale_fill_discrete(labels=c("Nc_point", "Nc_range", "Unknown Nc for this population"),
breaks=c("Nc_point", "Nc_range", "unknown")) +
xlab("") +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
labs(fill="") +
ggtitle("Type of Nc data by population")
By taxa
Nc data availalbe by taxa?
metadata %>%
filter(!is.na(nc_pops_exists)) %>%
ggplot(aes(x=country_assessment, fill=nc_pops_exists))+
geom_bar() +
coord_flip() +
scale_fill_manual(values=c("brown", "darkgreen")) +
labs(fill="Has Nc data for indicator 1?") +
xlab("") +
ggtitle("Has Nc data?") +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank())
Ne data and genetic data available?
metadata %>%
filter(!is.na(ne_pops_exists)) %>%
ggplot(aes(x=country_assessment, fill=ne_pops_exists)) +
geom_bar(position = "dodge") +
scale_fill_manual(labels=c("no", "yes", "no, but has genetic data"),
breaks=c("no_genetic_data", "ne_available", "other_genetic_info"),
values=c("brown", "darkgreen", "grey")) +
coord_flip() +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
ggtitle("Has Ne data?")
Ne data yes or not?
metadata %>%
filter(!is.na(ne_pops_exists)) %>%
filter(ne_pops_exists!="other_genetic_info") %>%
ggplot(aes(x=country_assessment, fill=ne_pops_exists)) +
geom_bar() +
scale_fill_manual(labels=c("no", "yes"),
breaks=c("no_genetic_data", "ne_available"),
values=c("brown", "darkgreen")) +
coord_flip() +
xlab("") +
labs(fill="Has Ne data for indicator 1?") +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
ggtitle("Has Ne data?")
How is Ne data distributed?
summary(ind1_data$Ne)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.9 41.0 150.0 8782.2 639.0 500000.0 2375
Boxplot of Ne values:
ind1_data %>%
ggplot(aes(x=country_assessment, y=Ne)) +
geom_boxplot()
Check outliers (Ne very high):
ind1_data %>%
filter(Ne > 100000) %>%
select(country_assessment, name_assessor, taxon, taxonomic_group, Ne, NeLower, NeUpper, multiassessment, population)
Boxplot filtering outliers (Ne)
ind1_data %>% filter(Ne < 100000) %>%
ggplot(aes(x=country_assessment, y=Ne)) +
geom_boxplot()
Get function to transform NcRange and NcPoint data into Ne.
# check what the custom funciton does
transform_to_Ne
## function (ind1_data)
## {
## ind1_data = ind1_data
## ind1_data <- ind1_data %>% mutate(Nc_from_range = case_when(NcRange ==
## "more_5000_bymuch" ~ 5001, NcRange == "more_5000" ~ 5001,
## NcRange == "less_5000_bymuch" ~ 100, NcRange == "less_5000" ~
## 100, NcRange == "range_includes_5000" ~ 5001)) %>%
## mutate(Ne_from_Nc = case_when(!is.na(NcPoint) ~ NcPoint *
## 0.1, !is.na(Nc_from_range) ~ Nc_from_range * 0.1)) %>%
## mutate(Ne_combined = if_else(is.na(Ne), Ne_from_Nc, Ne))
## print(ind1_data)
## }
Use function to get Ne data from NcRange or NcPoint data, and their combination (Ne estimated from Ne if Ne is available, otherwise, from Nc)
ind1_data<-transform_to_Ne(ind1_data = ind1_data)
## # A tibble: 2,549 × 38
## country_assessme… taxonomic_group taxon scientific_auth… genus year_assesment
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 sweden mammal Alce… (Linnaeus, 1758) Alces 2023
## 2 sweden mammal Alce… (Linnaeus, 1758) Alces 2023
## 3 sweden mammal Alce… (Linnaeus, 1758) Alces 2023
## 4 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 5 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 6 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 7 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 8 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 9 sweden fish Silu… (Linnaeus, 1758) Silu… 2023
## 10 sweden bird Dend… Bechstein 1803 Dend… 2022
## # … with 2,539 more rows, and 32 more variables: name_assessor <chr>,
## # email_assessor <chr>, kobo_tabular <chr>, time_populations <chr>,
## # X_validation_status <chr>, X_uuid <chr>, multiassessment <chr>,
## # population <chr>, Name <chr>, Origin <chr>, IntroductionYear <chr>,
## # Ne <dbl>, NeLower <dbl>, NeUpper <dbl>, NeYear <chr>, GeneticMarkers <chr>,
## # GeneticMarkersOther <chr>, MethodNe <chr>, SourceNe <chr>, NcType <chr>,
## # NcYear <chr>, NcMethod <chr>, NcRange <chr>, NcRangeDetails <chr>, …
ind1_data %>%
select(taxon, population, Ne, NcPoint, NcRange, Ne_from_Nc, Ne_combined)
Get function to estimate indicator 1
# check what the custom funciton does
estimate_indicator1
## function (ind1_data)
## {
## indicator1 <- ind1_data %>% group_by(X_uuid, ) %>% summarise(n_pops = n(),
## n_pops_Ne_data = sum(!is.na(Ne_combined)), n_pops_more_500 = sum(Ne_combined >
## 500, na.rm = TRUE), indicator1 = n_pops_more_500/n_pops_Ne_data) %>%
## left_join(metadata)
## print(indicator1)
## }
Now estimate indicator 1 :)
indicator1<-estimate_indicator1(ind1_data = ind1_data)
## # A tibble: 547 × 70
## X_uuid n_pops n_pops_Ne_data n_pops_more_500 indicator1 country_assessm…
## <chr> <int> <int> <int> <dbl> <chr>
## 1 010d85cd-5… 2 1 1 1 united_states
## 2 016d59ae-9… 1 1 0 0 mexico
## 3 019bd95f-b… 1 1 0 0 sweden
## 4 01b10b29-9… 1 1 1 1 south_africa
## 5 0301e6b3-b… 3 3 3 1 france
## 6 037d6c8f-7… 4 2 2 1 united_states
## 7 03f03179-1… 1 1 1 1 south_africa
## 8 0586b61e-7… 12 12 0 0 belgium
## 9 065a53ba-0… 1 1 0 0 south_africa
## 10 06e6bb50-3… 1 1 0 0 belgium
## # … with 537 more rows, and 64 more variables: taxonomic_group <chr>,
## # taxon <chr>, scientific_authority <chr>, genus <chr>, year_assesment <chr>,
## # name_assessor <chr>, email_assessor <chr>, common_name <chr>,
## # kobo_tabular <chr>, X_validation_status <chr>, GBIF_taxonID <int>,
## # NCBI_taxonID <chr>, national_taxonID <chr>, source_national_taxonID <chr>,
## # other_populations <chr>, time_populations <chr>, defined_populations <chr>,
## # source_definition_populations <chr>, map_populations <chr>, …
indicator1
By country
# get sample size by desired category
sample_size <- indicator1 %>%
group_by(country_assessment) %>% summarize(num=n())
# plot
indicator1 %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(country_assessment, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator1, fill=country_assessment)) +
geom_violin(width= 1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
ylab("indicator 1 (proportion of populations Ne > 500)") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 20))
By taxonomic group
# get sample size by desired category
sample_size <- indicator1 %>%
group_by(taxonomic_group) %>% summarize(num=n())
# plot
indicator1 %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%
#plot
ggplot(aes(x=myaxis, y=indicator1, fill=taxonomic_group), color=NA) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1, color="brown") +
xlab("") +
coord_flip() +
scale_fill_manual(values= taxoncolors,
breaks=c(levels(as.factor(indicator1$taxonomic_group)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15))
By country and taxonomic group
indicator1 %>%
ggplot(aes(x=taxonomic_group, y=indicator1, fill=taxonomic_group), color=NA) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1, color="brown") +
xlab("") +
ylab("Indicator 1 (proportion of populations Ne > 500") +
coord_flip() +
scale_fill_manual(values= taxoncolors,
breaks=c(levels(as.factor(indicator1 $taxonomic_group)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 14)) +
facet_wrap(~country_assessment, ncol = 5) +
theme(panel.spacing = unit(1.5, "lines"))
By IUCN
# get sample size by desired category
sample_size <- indicator1 %>%
group_by(global_IUCN) %>% summarize(num=n())
# plot
indicator1 %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(global_IUCN, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator1, fill=global_IUCN)) +
geom_violin(width=.7, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
scale_fill_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(indicator1$global_IUCN)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) +
theme(panel.spacing = unit(1.5, "lines"))
By IUCN and country
# get sample size by desired category
# plot
indicator1 %>%
# plot
ggplot(aes(x=global_IUCN, y=indicator1, fill=global_IUCN)) +
geom_violin(width=.7, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
scale_fill_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(indicator1$global_IUCN)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) +
facet_wrap(~country_assessment, ncol = 5) +
theme(panel.spacing = unit(1.5, "lines"))
By range
# get sample size by desired category
sample_size <- indicator1 %>%
filter(!is.na(indicator1)) %>%
group_by(species_range) %>% summarize(num=n())
# plot
indicator1 %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator1 , fill=species_range)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))
By range and country
# plot
indicator1 %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=species_range, y=indicator1 , fill=species_range)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20)) +
facet_wrap(~country_assessment, ncol = 5) +
theme(panel.spacing = unit(1.5, "lines"))
Indicator 2 is the he proportion of populations within species which
are maintained. This can be estimated based on the
n_extant_populations and n_extint_populations,
as follows:
ind2_data$indicator2<- ind2_data$n_extant_populations / (ind2_data$n_extant_populations + ind2_data$n_extint_populations)
head(ind2_data$indicator2)
## [1] 1.0000000 0.5000000 0.2941176 1.0000000 0.3333333 1.0000000
See the distribution of the number of extant populations:
ind2_data %>%
ggplot(aes(x=n_extant_populations)) +
geom_histogram() +
ggtitle("histogram of extant populations")
Which taxa have more than 100 populations?
ind2_data %>%
filter(n_extant_populations>100) %>%
select(taxon, country_assessment, n_extant_populations)
Exclude outliers (>200 populations)
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=n_extant_populations)) +
geom_histogram() +
ggtitle("Distribution of extant populations \nexcluding outliers (>200 pops)")
How does the number of populations vary by country? (excluding outliers: >200 pops)
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=country_assessment, y=n_extant_populations)) +
geom_boxplot(aes(color=country_assessment)) +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extant populations by country, \nexcluding outliers (>200 pops)")
And by method to define populations? (excluding outliers: >200 pops)
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=defined_populations, y=n_extant_populations)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extant populations by pop definition method, \nexcluding outliers (>200 pops)")
Simplified method categories for easier visualization:
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extant populations by pop definition method, \nexcluding outliers (>200 pops)") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Number of populations by taxonomic group:
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=taxonomic_group)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extant populations by taxonomic group, \nexcluding outliers (>200 pops)") +
scale_color_manual(values= taxoncolors,
breaks=levels(levels(as.factor(ind2_data$taxonomic_group))))
Taxonomic group and method:
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=taxonomic_group)) +
geom_boxplot() +
geom_jitter(size=.3, width = 0.1, color="darkred") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extant populations by taxonomic group and method to define pops, excluding outliers (>200 pops)") +
scale_color_manual(values= taxoncolors,
breaks=levels(levels(as.factor(ind2_data$taxonomic_group)))) +
facet_wrap(defined_populations_simplified ~ .) +
xlab("")
Country and method:
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.3, width = 0.1, color="black") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extant populations by country and \nmethod to define pops, excluding outliers (>200 pops)") +
facet_wrap(country_assessment ~ ., nrow=2) +
xlab("") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Country and method, but with the US and Sweden in different scale
## excluding Sweden and US
p1<-ind2_data %>%
filter(n_extant_populations<50) %>%
filter(country_assessment!="sweden", country_assessment!="united_states") %>%
# plot
ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.3, width = 0.1, color="black") +
coord_flip() +
theme_bw() + ylab("") +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extant populations by country and method to define pops, excluding outliers (>200 pops)") +
facet_wrap(country_assessment ~ ., nrow=2) +
xlab("") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
## With Sweden and US
p2<-ind2_data %>%
filter(n_extant_populations<200) %>%
filter(country_assessment %in% c("sweden", "united_states")) %>%
# plot
ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.3, width = 0.1, color="black") +
coord_flip() +
theme_bw() + xlab("") + ylab("Number of extant populations") +
theme(panel.border = element_blank(), legend.position="none") +
facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(2,1))
Number of populations by taxonomic group and range type:
# add species range metadata
ind2_data$species_range <- metadata$species_range
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=species_range)) +
geom_boxplot() +
geom_jitter(size=.3, position = position_jitterdodge()) +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extant populations by taxonomic group and range, \nexcluding outliers (>200 pops)")
Number of populations by taxonomic group and global IUCN:
# add species range metadata
ind2_data$global_IUCN <- metadata$global_IUCN
ind2_data %>%
filter(n_extant_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extant_populations, col=global_IUCN)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extant populations by taxonomic group and global IUCN, \nexcluding outliers (>200 pops)") +
scale_color_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(ind2_data$global_IUCN))))
One-way ANOVA for the effect of the method to define populations on the number of extant pops, removing the extreme outlier (>1,000 pops)
# subset data without massive outlier
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<1000)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 41
## genetic_clusters
## 105
## genetic_clusters eco_biogeo_proxies
## 19
## genetic_clusters geographic_boundaries
## 74
## geographic_boundaries
## 272
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 74
## geographic_boundaries management_units
## 25
## low_freq_combinations
## 101
## management_units
## 127
## other
## 13
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ defined_populations_simplified,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified Residuals
## Sum of Squares 34724.1 1798790.6
## Deg. of Freedom 11 890
##
## Residual standard error: 44.95679
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 34724 3157 1.562 0.105
## Residuals 890 1798791 2021
Same One-way ANOVA for the effect of the method to define populations on the number of extant pops, but removing outliers >200 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<200)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 40
## genetic_clusters
## 105
## genetic_clusters eco_biogeo_proxies
## 19
## genetic_clusters geographic_boundaries
## 72
## geographic_boundaries
## 268
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 73
## geographic_boundaries management_units
## 25
## low_freq_combinations
## 100
## management_units
## 126
## other
## 13
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ defined_populations_simplified,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified Residuals
## Sum of Squares 17687.1 426965.7
## Deg. of Freedom 11 880
##
## Residual standard error: 22.02699
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 17687 1607.9 3.314 0.000176 ***
## Residuals 880 426966 485.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the country on the number of extant pops, removing outliers >200 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<200)
# summary of n per variable
table(ind2_data_anova$country_assessment)
##
## australia belgium colombia france japan
## 85 111 47 71 50
## mexico south_africa sweden united_states
## 95 121 121 191
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ country_assessment, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ country_assessment, data = ind2_data_anova)
##
## Terms:
## country_assessment Residuals
## Sum of Squares 36921.2 407731.6
## Deg. of Freedom 8 883
##
## Residual standard error: 21.48854
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## country_assessment 8 36921 4615 9.995 2.15e-13 ***
## Residuals 883 407732 462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the taxonomic group on the number of extant pops, removing outliers >200 pops and taxonomic groups with too few data
# summary of n per variable
table(ind2_data$taxonomic_group)
##
## amphibian angiosperm bird bryophyte fish
## 63 233 140 5 70
## fungus gymnosperm invertebrate mammal other
## 3 19 148 140 17
## pteridophytes reptile
## 14 72
# subset data
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<200) %>%
filter(taxonomic_group %!in% c("fungus", "bryophyte", "other", "pteridophytes"))
# summary of n per variable
table(ind2_data_anova$taxonomic_group)
##
## amphibian angiosperm bird fish gymnosperm invertebrate
## 60 229 139 69 18 142
## mammal reptile
## 135 62
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ taxonomic_group, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ taxonomic_group, data = ind2_data_anova)
##
## Terms:
## taxonomic_group Residuals
## Sum of Squares 23890.3 417880.0
## Deg. of Freedom 7 846
##
## Residual standard error: 22.22494
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## taxonomic_group 7 23890 3413 6.909 5.17e-08 ***
## Residuals 846 417880 494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<200)
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ defined_populations_simplified *
## country_assessment, data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 17687.1 26099.2
## Deg. of Freedom 11 8
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 26752.3 374114.1
## Deg. of Freedom 46 826
##
## Residual standard error: 21.28198
## 42 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 11 17687 1608 3.550
## country_assessment 8 26099 3262 7.203
## defined_populations_simplified:country_assessment 46 26752 582 1.284
## Residuals 826 374114 453
## Pr(>F)
## defined_populations_simplified 6.79e-05 ***
## country_assessment 2.95e-09 ***
## defined_populations_simplified:country_assessment 0.101
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n
# variables with enough n
enough_n<-ind2_data %>%
group_by(country_assessment, defined_populations_simplified) %>%
summarise(n=n()) %>%
filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>%
filter(n_extant_populations<200) %>%
# this gives the country
filter(country_assessment==unique(enough_n$country_assessment)[1] &
#this gives the methods for that country (the last [[1]] is to get the results out of a list)
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]] |
# the same for rest of countries. Notice the use of & for methods within country and | to change to other country
country_assessment==unique(enough_n$country_assessment)[2] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[3] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[4] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[5] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[6] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[7] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[8] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = n_extant_populations ~ defined_populations_simplified *
## country_assessment, data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 5564.08 5584.38
## Deg. of Freedom 7 4
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 328.40 138849.53
## Deg. of Freedom 3 509
##
## Residual standard error: 16.51632
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 7 5564 794.9 2.914
## country_assessment 4 5584 1396.1 5.118
## defined_populations_simplified:country_assessment 3 328 109.5 0.401
## Residuals 509 138850 272.8
## Pr(>F)
## defined_populations_simplified 0.005365 **
## country_assessment 0.000475 ***
## defined_populations_simplified:country_assessment 0.752137
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
See the distribution of the number of extinct populations:
ind2_data %>%
ggplot(aes(x=n_extint_populations)) +
geom_histogram() +
ggtitle("histogram of extinct populations")
Exclude outliers (>200 populations)
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=n_extint_populations)) +
geom_histogram() +
ggtitle("Distribution of extinct populations \nexcluding outliers (>200 pops)")
How does the number of populations vary by country? (excluding outliers: >200 pops)
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=country_assessment, y=n_extint_populations)) +
geom_boxplot(aes(color=country_assessment)) +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extinct populations by country, \nexcluding outliers (>200 pops)")
by method to define populations? (excluding outliers: >200 pops) with simplified method categories for easier visualization:
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=defined_populations_simplified, y=n_extint_populations, color=defined_populations_simplified)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extinct populations by pop definition method, \nexcluding outliers (>200 pops)") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Number of populations by taxonomic group:
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=taxonomic_group)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extinct populations by taxonomic group, \nexcluding outliers (>200 pops)") +
scale_color_manual(values= taxoncolors,
breaks=levels(levels(as.factor(ind2_data$taxonomic_group))))
Taxonomic group and method:
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=taxonomic_group)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extinct populations by taxonomic group and \nmethod to define pops, excluding outliers (>200 pops)") +
scale_color_manual(values= taxoncolors,
breaks=levels(levels(as.factor(ind2_data$taxonomic_group)))) +
facet_wrap(defined_populations_simplified ~ .) +
xlab("")
Country and method:
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=defined_populations_simplified, y=n_extint_populations, color=defined_populations_simplified)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
ggtitle("Distribution of extinct populations by country and \nmethod to define pops, excluding outliers (>200 pops)") +
facet_wrap(country_assessment ~ ., nrow=2) +
xlab("") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Number of populations by taxonomic group and range type:
# add species range metadata
ind2_data$species_range <- metadata$species_range
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=species_range)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extinct populations by taxonomic group and range, \nexcluding outliers (>200 pops)")
Number of populations by taxonomic group and global IUCN:
# add species range metadata
ind2_data$global_IUCN <- metadata$global_IUCN
ind2_data %>%
filter(n_extint_populations<200) %>%
ggplot(aes(x=taxonomic_group, y=n_extint_populations, col=global_IUCN)) +
geom_boxplot() +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank()) +
ggtitle("Distribution of extinct populations by taxonomic group and global IUCN, \nexcluding outliers (>200 pops)") +
scale_color_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(ind2_data$global_IUCN))))
By method
ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<200) %>%
# plot
ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
geom_point() +
theme_bw() +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
By method. Sweden and US separately because they have too many pops.
## excluding Sweden and US
p1 <- ind2_data %>%
# filter outliers with too many pops and desired countries
filter(n_extant_populations<50) %>%
filter(country_assessment!="sweden", country_assessment!="united_states") %>%
# plot
ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
geom_point() + theme_bw() + xlab("") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified))) + facet_wrap(country_assessment ~ ., nrow=2) +
theme(panel.border = element_blank(), legend.position="none")
## With Sweden and US
p2<-ind2_data %>%
# filter outliers with too many pops and desired countries
filter(n_extant_populations<200) %>%
filter(country_assessment %in% c("sweden", "united_states")) %>%
# plot
ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
geom_point() + theme_bw() +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified))) + facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
theme(panel.border = element_blank(), legend.position="bottom")
## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(1.2,1))
By risk status, zooming in to fewer n of pops. Sweden and US separately
because they have too many pops.
## excluding Sweden and US
p1 <- ind2_data %>%
# filter outliers with too many pops and desired countries
filter(n_extant_populations<50) %>%
filter(country_assessment!="sweden", country_assessment!="united_states") %>%
# plot
ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=global_IUCN)) +
geom_point() + theme_bw() + xlab("") +
scale_color_manual(values=IUCNcolors,
breaks=levels(as.factor(ind2_data$global_IUCN))) + facet_wrap(country_assessment ~ ., nrow=2) +
theme(panel.border = element_blank(), legend.position="none")
## With Sweden and US
p2<-ind2_data %>%
# filter outliers with too many pops and desired countries
filter(n_extant_populations<200) %>%
filter(country_assessment %in% c("sweden", "united_states")) %>%
# plot
ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=global_IUCN)) +
geom_point() + theme_bw() +
scale_color_manual(values=IUCNcolors,
breaks=levels(as.factor(ind2_data$global_IUCN))) + facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
theme(panel.border = element_blank(), legend.position="bottom")
## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(1.2,1))
We have NA because in some cases the number of extinct populations is unknown, therefore the above operation cannot be computed.
Total records with NA in extant populations:
sum(is.na(ind2_data$n_extant_populations))
## [1] 21
Which are?
ind2_data %>%
filter(is.na(n_extant_populations)) %>%
select(country_assessment, taxonomic_group, taxon, n_extant_populations, n_extint_populations)
Total taxa with NA in extinct populations:
sum(is.na(ind2_data$n_extint_populations))
## [1] 359
Do taxa with NA for extant also have NA for extinct?
ind2_data$taxon[is.na(ind2_data$n_extant_populations)] %in% ind2_data$taxon[is.na(ind2_data$n_extint_populations)]
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE
So out of the 924, we have 359 records with NA in n_extinct and 21 records with NA in n_extant. Of them, 21 have NA in both n_extant and n_extinct.
QUESTION: should we manually check that NA are correct in both extinct or extant pops? (the cleaning script only chekcs for 0, not NAs)
So in total there are 359 records where there are NA in either n_extant or n_extinct, which is 38.85% of total number of records. Therefore when estimating indicator 2… QUESTION: What should we do: A) we can’t estimate indicator 2 in those species, or B) we assume n_extinct = NA = 0, and therefore indicator 2 = 1.
By country
ind2_data %>%
ggplot(aes(x=country_assessment, fill=is.na(indicator2)))+
geom_bar(position = "dodge") +
coord_flip() +
scale_fill_discrete(labels=c("number of population known", "has missing data")) +
labs(fill="Indicator 2 data") +
xlab("")
By taxonomic group
ind2_data %>%
ggplot(aes(x=taxonomic_group, fill=is.na(indicator2)))+
geom_bar(position = "dodge") +
coord_flip()+
scale_fill_discrete(labels=c("number of populations known", "has missing data")) +
labs(fill="Indicator 2 data") +
xlab("")
By method to define pops
ind2_data %>%
ggplot(aes(x=defined_populations_simplified, fill=is.na(indicator2)))+
geom_bar(position = "dodge") +
coord_flip()+
scale_fill_discrete(labels=c("number of populations known", "has missing data")) +
labs(fill="Indicator 2 data") +
xlab("")
By method to define pops and country
ind2_data %>%
ggplot(aes(x=defined_populations_simplified, fill=is.na(indicator2)))+
geom_bar(position = "dodge") +
coord_flip()+
scale_fill_discrete(labels=c("number of populations known", "has missing data")) +
labs(fill="Indicator 2 data") +
xlab("") + facet_wrap(country_assessment ~., nrow = 2) +
theme(panel.border = element_blank(), legend.position="bottom")
By taxonomic group and country
ind2_data %>%
ggplot(aes(x=taxonomic_group, fill=is.na(indicator2)))+
geom_bar(position = "dodge") +
coord_flip()+
scale_fill_discrete(labels=c("number of populations known", "has missing data")) +
labs(fill="Indicator 2 data") +
xlab("") + facet_wrap(country_assessment ~., nrow = 2) +
theme(panel.border = element_blank(), legend.position="right")
Some taxa were assessed more than once to account for, for example, different ways in how to delimit populations. Create a subset of them, excluding those records with missing data in indicator2 (due to missing data in n_pops).
#subset only with taxa assessed multiple times:
ind2_data_multi<-ind2_data %>%
filter(multiassessment=="multiassessment")
In total there are 93 multiassessed records, of 44 taxa. Notice that this can include missing data in the number of populations, hence not allowing to estimate indicator 2.
To be able to visualize the missing data, the following plot changes NA to -1. Variation in the number of extant populations by assessment
# plot
ind2_data_multi %>%
mutate(n_extant_populations=ifelse(is.na(n_extant_populations), -1, n_extant_populations)) %>%
ggplot(aes(x=taxon, y=n_extant_populations)) +
geom_line(colour="darkgrey") +
geom_point(aes(color=country_assessment)) +
xlab("") +
coord_flip() +
ggtitle("Variation in the number of extant populations, \nfor multiassessed taxa") +
theme_bw() +
theme(legend.position = "bottom")
Same plot, but excluding Bombus terricola’s massive variation:
# plot
ind2_data_multi %>%
filter(taxon!="Bombus terricola") %>%
mutate(n_extant_populations=ifelse(is.na(n_extant_populations), -1, n_extant_populations)) %>%
ggplot(aes(x=taxon, y=n_extant_populations)) +
geom_line(colour="darkgrey") +
geom_point(aes(color=country_assessment)) +
xlab("") +
coord_flip() +
ggtitle("Variation in the number of extant populations, \nfor multiassessed taxa") +
theme_bw() +
theme(legend.position = "bottom")
Now for extinct populations (NA transformed as -1 for visualization purposes):
# plot
ind2_data_multi %>%
mutate(n_extint_populations=ifelse(is.na(n_extint_populations), -1, n_extint_populations)) %>%
ggplot(aes(x=taxon, y=n_extint_populations)) +
geom_line(colour="darkgrey") +
geom_point(aes(color=country_assessment)) +
xlab("") +
coord_flip() +
ggtitle("Variation in the number of extinct populations, \nfor multiassessed taxa") +
theme_bw() +
theme(legend.position = "bottom")
See you later, Bombus terricola
# plot
ind2_data_multi %>%
filter(taxon!="Bombus terricola") %>%
mutate(n_extint_populations=ifelse(is.na(n_extint_populations), -1, n_extint_populations)) %>%
ggplot(aes(x=taxon, y=n_extint_populations)) +
geom_line(colour="darkgrey") +
geom_point(aes(color=country_assessment)) +
xlab("") +
coord_flip() +
ggtitle("Variation in the number of extinct populations, \nfor multiassessed taxa") +
theme_bw() +
theme(legend.position = "bottom")
This is how much the values of indicator2 vary within mutliassessed taxa (taxa names with no shown values mean they have missing data in the number of populations and hence indicator 2 can’t be estimated):
# plot
ind2_data_multi %>%
ggplot(aes(x=taxon, y=indicator2)) +
geom_line(colour="darkgrey") +
geom_point(aes(color=country_assessment)) +
xlab("") +
coord_flip() +
ggtitle("Variation in indicator 2, \nfor multiassessed taxa") +
theme_bw() +
theme(legend.position = "bottom")
For exploratory purposes, unless otherwise stated differently, the analyses below will use a subset of the data including only taxa assessed a single time, plus the first record of those assessed multiple times.
#subset keeping only taxa assessed a single time, plust the first record of those assessed multiple times.
ind2_data_firstmulti<-ind2_data[!duplicated(cbind(ind2_data$taxon, ind2_data$country_assessment)), ]
Remember, for exploratory purposes, unless otherwise stated differently, the analyses below will use a subset of the data including only taxa assessed a single time, plus the first record of those assessed multiple times.
For the taxa that do have data, this is how the values of indicator2 are distributed:
hist(ind2_data_firstmulti$indicator2)
Visualizing by country
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(country_assessment) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(country_assessment, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator2, fill=country_assessment)) +
geom_violin(width= 1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none")
Visualizing by taxonomic group:
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(taxonomic_group) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%
#plot
ggplot(aes(x=myaxis, y=indicator2, fill=taxonomic_group), color=NA) +
geom_violin(width=2, linewidth = 0) +
geom_jitter(size=.5, width = 0.1, color="brown") +
xlab("") +
coord_flip() +
scale_fill_manual(values= taxoncolors,
breaks=c(levels(as.factor(ind2_data_firstmulti$taxonomic_group)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15))
Same boxplot
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(taxonomic_group) %>% summarize(num=n())
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%
#plot
ggplot(aes(x=myaxis, y=indicator2, color=taxonomic_group)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="brown") +
xlab("") +
coord_flip() +
scale_color_manual(values= taxoncolors,
breaks=c(levels(as.factor(ind2_data_firstmulti$taxonomic_group)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15))
Zoom in to invertebrates by country:
ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
filter(taxonomic_group=="invertebrate") %>%
ggplot(aes(x=country_assessment, y=indicator2, fill=country_assessment), color=NA) +
geom_violin() +
geom_jitter(size=.7, width = 0.1, color="black") +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15)) +
ggtitle("Distribution of Indicator 2 for Invertebrates")
Visualizing by IUCN:
# add IUCN data form metadata to ind2
ind2_data_firstmulti$global_IUCN <- metadata_firstmulti$global_IUCN
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(global_IUCN) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(global_IUCN, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator2, fill=global_IUCN)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
scale_fill_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(ind2_data_firstmulti$global_IUCN)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15))
Visualizing by range type:
# add data form metadata to ind2
ind2_data_firstmulti$species_range <- metadata_firstmulti$species_range
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(species_range) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator2, fill=species_range)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))
Visualizing by rarity:
# add data form metadata to ind2
ind2_data_firstmulti$rarity <- metadata_firstmulti$rarity
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(rarity) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(rarity, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator2, fill=rarity)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))
By population method, boxplot version:
# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
filter(!is.na(indicator2)) %>%
group_by(defined_populations_simplified) %>% summarize(num=n())
# plot
ind2_data_firstmulti %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(defined_populations_simplified, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=indicator2, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="black") +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Facet by country
ind2_data_firstmulti %>%
# plot
ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="black") +
xlab("") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
facet_wrap(~country_assessment, ncol=3) +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified)))
Facet by country and IUCN
ind2_data_firstmulti %>%
# plot
ggplot(aes(x=global_IUCN, y=indicator2, fill=global_IUCN)) +
geom_violin(width=1, linewidth = 0) +
geom_jitter(size=.5, width = 0.1) +
xlab("") +
coord_flip() +
scale_fill_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(ind2_data_firstmulti$global_IUCN)))) +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) +
facet_wrap(~country_assessment, ncol=3) +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified)))
Value of indicator 2 within a single country, by method to define populations, taxonomic group and iucn. Including only 1 assessment per multiassessed taxa.
### For loop, 3 plots (defined_populations_simplified, taxonomic and global IUCN) by country.
for(i in unique(ind2_data_firstmulti$country_assessment)){
# filter to desired country in the loop
x <-ind2_data_firstmulti %>% filter(country_assessment==i)
### for pops method
# build plot
p1<- x %>%
ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="black") +
coord_flip() +
xlab("") +
## manually set color
# use ind2_data_firstmulti instead of x to define number of colors and breaks
# to ensure same colors are used in all countries
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified))) +
#theme options
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
# the plot at the top should have title and no x axis text and labels
theme(axis.text.x = element_blank()) +
ylab("") +
ggtitle(i)
### for taxonomic group
# build plot
p2<- x %>%
ggplot(aes(x=taxonomic_group, y=indicator2, color=taxonomic_group)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="black") +
coord_flip() +
xlab("") +
## manually set color
# use ind2_data_firstmulti instead of x to define number of colors and breaks
# to ensure same colors are used in all countries
scale_color_manual(values=taxoncolors,
breaks=levels(as.factor(ind2_data_firstmulti$taxonomic_group))) +
# theme options
theme_bw() +
theme(panel.border = element_blank(), legend.position="none") +
# the plot at the middle should have no x axis text and labels and no title
theme(axis.text.x = element_blank()) +
ylab("")
### for global IUCN
# build plot
p3<- x %>%
ggplot(aes(x=global_IUCN, y=indicator2, color=global_IUCN)) +
geom_boxplot() +
geom_jitter(size=.5, width = 0.1, color="black") +
coord_flip() +
xlab("") +
## manually set color
# use ind2_data_firstmulti instead of x to define number of colors and breaks
# to ensure same colors are used in all countries
scale_color_manual(values=IUCNcolors, # iucn color codes,
breaks=levels(as.factor(ind2_data_firstmulti$global_IUCN))) +
# theme options
theme_bw() +
theme(panel.border = element_blank(), legend.position="none")
# the plot at the bottom should have axis and labels
#### the 3 plots togheter
# plot_grid from cowplot helps to keep the ploting area aligned.
print(plot_grid(p1, p2, p3, ncol = 1, align = 'v'))
}
ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<200) %>%
# plot
ggplot(aes(x=n_extant_populations, y=indicator2)) +
geom_point()
Same, colouring by country
ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<200) %>%
# plot
ggplot(aes(x=n_extant_populations, y=indicator2, color=country_assessment)) +
geom_point() +
theme_bw()
By global IUCN risk
ind2_data$global_IUCN<-metadata$global_IUCN
ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<200) %>%
# plot
ggplot(aes(x=n_extant_populations, y=indicator2, color=global_IUCN)) +
geom_point() +
scale_color_manual(values= IUCNcolors, # iucn color codes
breaks=c(levels(as.factor(ind2_data$global_IUCN)))) +
theme_bw()
By population definition method:
ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<200) %>%
# plot
ggplot(aes(x=n_extant_populations, y=indicator2, color=defined_populations_simplified)) +
geom_point() +
theme_bw() +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
Run model (first removing missing data)
# remove missing data
ind2_data_wo_missing<-ind2_data %>%
filter(!is.na(indicator2)) %>%
filter(n_extant_populations<500) # doesn't make a difference in the test below, but useful for plots
# check n per method
table(ind2_data_wo_missing$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 30
## genetic_clusters
## 50
## genetic_clusters eco_biogeo_proxies
## 12
## genetic_clusters geographic_boundaries
## 46
## geographic_boundaries
## 178
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 56
## geographic_boundaries management_units
## 17
## low_freq_combinations
## 69
## management_units
## 45
## other
## 9
# run model
m2 <- glm(ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations +
ind2_data_wo_missing$defined_populations_simplified +
ind2_data_wo_missing$n_extant_populations*ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
m2
##
## Call: glm(formula = ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations +
## ind2_data_wo_missing$defined_populations_simplified + ind2_data_wo_missing$n_extant_populations *
## ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
##
## Coefficients:
## (Intercept)
## 2.40748
## ind2_data_wo_missing$n_extant_populations
## 0.04605
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies
## -0.90120
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters
## 0.48522
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies
## 1.73254
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries
## -0.27973
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries
## -1.00348
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits
## 0.12491
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies
## -1.08795
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units
## -0.65696
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations
## -0.70210
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units
## -1.81268
## ind2_data_wo_missing$defined_populations_simplifiedother
## -2.49424
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies
## -0.04288
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters
## -0.12867
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies
## -0.18537
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries
## -0.05244
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries
## -0.04408
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits
## -0.03108
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies
## -0.05005
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units
## 0.05238
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations
## -0.04496
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units
## -0.05417
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother
## 0.66367
##
## Degrees of Freedom: 562 Total (i.e. Null); 539 Residual
## Null Deviance: 246.2
## Residual Deviance: 214.7 AIC: NA
summary(m2)
##
## Call:
## glm(formula = ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations +
## ind2_data_wo_missing$defined_populations_simplified + ind2_data_wo_missing$n_extant_populations *
## ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8020 -0.3222 0.3601 0.5988 0.9748
##
## Coefficients:
## Estimate
## (Intercept) 2.40748
## ind2_data_wo_missing$n_extant_populations 0.04605
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies -0.90120
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.48522
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 1.73254
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries -0.27973
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries -1.00348
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.12491
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies -1.08795
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units -0.65696
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations -0.70210
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units -1.81268
## ind2_data_wo_missing$defined_populations_simplifiedother -2.49424
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies -0.04288
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters -0.12867
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies -0.18537
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries -0.05244
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries -0.04408
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits -0.03108
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies -0.05005
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 0.05238
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations -0.04496
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units -0.05417
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother 0.66367
## Std. Error
## (Intercept) 0.78714
## ind2_data_wo_missing$n_extant_populations 0.12562
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies 0.85777
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.93775
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 1.52683
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries 0.84635
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries 0.79812
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.92914
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies 0.81582
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 1.01984
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations 0.82259
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units 0.82101
## ind2_data_wo_missing$defined_populations_simplifiedother 1.22073
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies 0.12576
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.16613
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 0.13815
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries 0.12566
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries 0.12567
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.13199
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies 0.12567
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 0.20612
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations 0.12620
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units 0.12677
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother 0.58195
## t value
## (Intercept) 3.059
## ind2_data_wo_missing$n_extant_populations 0.367
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies -1.051
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.517
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 1.135
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries -0.331
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries -1.257
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.134
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies -1.334
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units -0.644
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations -0.854
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units -2.208
## ind2_data_wo_missing$defined_populations_simplifiedother -2.043
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies -0.341
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters -0.775
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies -1.342
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries -0.417
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries -0.351
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits -0.235
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies -0.398
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 0.254
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations -0.356
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units -0.427
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother 1.140
## Pr(>|t|)
## (Intercept) 0.00233
## ind2_data_wo_missing$n_extant_populations 0.71407
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies 0.29390
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.60507
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 0.25699
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries 0.74114
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries 0.20919
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.89311
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies 0.18291
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 0.51973
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations 0.39374
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units 0.02767
## ind2_data_wo_missing$defined_populations_simplifiedother 0.04151
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies 0.73325
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters 0.43896
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies 0.18024
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries 0.67658
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries 0.72594
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits 0.81395
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies 0.69058
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units 0.79949
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations 0.72178
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units 0.66933
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother 0.25461
##
## (Intercept) **
## ind2_data_wo_missing$n_extant_populations
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units *
## ind2_data_wo_missing$defined_populations_simplifiedother *
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.3995451)
##
## Null deviance: 246.17 on 562 degrees of freedom
## Residual deviance: 214.70 on 539 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 7
Put side by side the plots of number of populations and indicator 2 range (excluding >200 pops outlieres):
## plot for number of populations
# sample size of TOTAL populations
sample_size <- ind2_data %>%
filter(!is.na(indicator2)) %>%
group_by(defined_populations_simplified) %>% summarize(num=n())
p1<-ind2_data %>%
filter(n_extant_populations<500) %>%
# add sampling size
left_join(sample_size) %>%
mutate(myaxis = paste0(defined_populations_simplified, " (n= ", num, ")")) %>%
# plot
ggplot(aes(x=myaxis, y=n_extant_populations, color=defined_populations_simplified)) +
geom_boxplot() + xlab("") + ylab("Number of extant populations") +
geom_jitter(size=.4, width = 0.1, color="black") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none",
plot.margin = unit(c(0, 0, 0, 0), "cm")) + # this is used to decrease the space between plots
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
theme(text = element_text(size = 15))
## plot for indicator 2
p2<-ind2_data %>%
filter(n_extant_populations<500) %>%
ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
geom_boxplot() + xlab("") + ylab("Indicator 2") +
geom_jitter(size=.4, width = 0.1, color="black") +
coord_flip() +
theme_bw() +
theme(panel.border = element_blank(), legend.position="none", axis.text.y = element_blank(),
plot.margin = unit(c(0, 0, 0, 0), "cm")) + # this is used to decrease the space between plots) +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
theme(text = element_text(size = 15))
plot_grid(p1, p2, ncol=2, rel_widths = c(1.5,1))
Add the scatter plot of indicator2 and extant pops as a third pannel
p3<- ind2_data %>%
# filter outliers with too many pops
filter(n_extant_populations<500) %>%
# plot
ggplot(aes(x=n_extant_populations, y=indicator2, color=defined_populations_simplified)) +
geom_point() +
theme_bw() +
scale_color_manual(values=simplifiedmethods_colors,
breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
theme(legend.position = "none") +
ylab("Indicator 2") +
xlab("Number of extant populations") +
theme(text = element_text(size = 15))
p3
plot_grid(p1, p2, p3, ncol=3, rel_widths = c(1.7,1,1), align = "h")
One-way ANOVA for the effect of the method to define populations on indicator 2, removing the extreme outlier (>1,000 pops)
# subset data without massive outlier
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 30
## genetic_clusters
## 50
## genetic_clusters eco_biogeo_proxies
## 12
## genetic_clusters geographic_boundaries
## 46
## geographic_boundaries
## 180
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 56
## geographic_boundaries management_units
## 17
## low_freq_combinations
## 69
## management_units
## 45
## other
## 9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified, data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified Residuals
## Sum of Squares 3.349251 31.065342
## Deg. of Freedom 11 553
##
## Residual standard error: 0.2370148
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 3.349 0.30448 5.42 3.22e-08 ***
## Residuals 553 31.065 0.05618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Same One-way ANOVA for the effect of the method to define populations on indicator 2, but removing outliers >200 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<200)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 30
## genetic_clusters
## 50
## genetic_clusters eco_biogeo_proxies
## 12
## genetic_clusters geographic_boundaries
## 46
## geographic_boundaries
## 180
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 56
## geographic_boundaries management_units
## 17
## low_freq_combinations
## 69
## management_units
## 45
## other
## 9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified, data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified Residuals
## Sum of Squares 3.349251 31.065342
## Deg. of Freedom 11 553
##
## Residual standard error: 0.2370148
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 3.349 0.30448 5.42 3.22e-08 ***
## Residuals 553 31.065 0.05618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p
adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.3019396”,“2”:“-0.5147739”,“3”:“-0.089105408”,“4”:“2.516395e-04”,“rn”:“management_units-adaptive_traits
management_units”},{“1”:“-0.2008192”,“2”:“-0.3841766”,“3”:“-0.017461895”,“4”:“1.818658e-02”,“rn”:“management_units-eco_biogeo_proxies”},{“1”:“-0.1262098”,“2”:“-0.2505688”,“3”:“-0.001850756”,“4”:“4.305017e-02”,“rn”:“geographic_boundaries-genetic_clusters”},{“1”:“-0.1564425”,“2”:“-0.3078016”,“3”:“-0.005083409”,“4”:“3.550944e-02”,“rn”:“geographic_boundaries
eco_biogeo_proxies-genetic_clusters”},{“1”:“-0.3031382”,“2”:“-0.4629854”,“3”:“-0.143291001”,“4”:“6.329219e-08”,“rn”:“management_units-genetic_clusters”},{“1”:“-0.2758263”,“2”:“-0.5285669”,“3”:“-0.023085643”,“4”:“1.900037e-02”,“rn”:“management_units-genetic_clusters
eco_biogeo_proxies”},{“1”:“-0.2430020”,“2”:“-0.4061081”,“3”:“-0.079895959”,“4”:“8.400203e-05”,“rn”:“management_units-genetic_clusters
geographic_boundaries”},{“1”:“-0.1769285”,“2”:“-0.3065817”,“3”:“-0.047275251”,“4”:“5.587294e-04”,“rn”:“management_units-geographic_boundaries”},{“1”:“-0.3024332”,“2”:“-0.4823197”,“3”:“-0.122546624”,“4”:“3.431045e-06”,“rn”:“management_units-geographic_boundaries
adaptive_traits”},{“1”:“-0.2560209”,“2”:“-0.4774831”,“3”:“-0.034558622”,“4”:“8.946065e-03”,“rn”:“management_units-geographic_boundaries
management_units”},{“1”:“-0.2169626”,“2”:“-0.3660209”,“3”:“-0.067904295”,“4”:“1.434205e-04”,“rn”:“management_units-low_freq_combinations”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}
One-way ANOVA for the effect of the country on indicator 2, removing outliers >200 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<200)
# summary of n per variable
table(ind2_data_anova$country_assessment)
##
## australia belgium colombia france japan
## 28 27 36 34 50
## mexico south_africa sweden united_states
## 28 91 125 146
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ country_assessment, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ country_assessment, data = ind2_data_anova)
##
## Terms:
## country_assessment Residuals
## Sum of Squares 6.57625 27.83834
## Deg. of Freedom 8 556
##
## Residual standard error: 0.2237609
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## country_assessment 8 6.576 0.8220 16.42 <2e-16 ***
## Residuals 556 27.838 0.0501
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p
adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.4501955”,“2”:“-0.638140376”,“3”:“-0.262250604”,“4”:“4.566098e-10”,“rn”:“belgium-australia”},{“1”:“0.3124810”,“2”:“0.135083661”,“3”:“0.489878426”,“4”:“2.235340e-06”,“rn”:“colombia-belgium”},{“1”:“0.4010613”,“2”:“0.221441724”,“3”:“0.580680857”,“4”:“8.062411e-10”,“rn”:“france-belgium”},{“1”:“0.4714192”,“2”:“0.305005710”,“3”:“0.637832702”,“4”:“4.445180e-10”,“rn”:“japan-belgium”},{“1”:“0.4830699”,“2”:“0.295125041”,“3”:“0.671014813”,“4”:“4.447910e-10”,“rn”:“mexico-belgium”},{“1”:“0.4958742”,“2”:“0.343170891”,“3”:“0.648577540”,“4”:“4.445251e-10”,“rn”:“south_africa-belgium”},{“1”:“0.3288397”,“2”:“0.180964600”,“3”:“0.476714746”,“4”:“8.792915e-10”,“rn”:“sweden-belgium”},{“1”:“0.3531146”,“2”:“0.207140877”,“3”:“0.499088406”,“4”:“4.517717e-10”,“rn”:“united_states-belgium”},{“1”:“0.1589382”,“2”:“0.006630049”,“3”:“0.311246276”,“4”:“3.321074e-02”,“rn”:“japan-colombia”},{“1”:“0.1833932”,“2”:“0.046197636”,“3”:“0.320588708”,“4”:“1.204836e-03”,“rn”:“south_africa-colombia”},{“1”:“-0.1425795”,“2”:“-0.259176991”,“3”:“-0.025982076”,“4”:“4.877067e-03”,“rn”:“sweden-japan”},{“1”:“-0.1183046”,“2”:“-0.232481050”,“3”:“-0.004128079”,“4”:“3.565271e-02”,“rn”:“united_states-japan”},{“1”:“-0.1542303”,“2”:“-0.299917576”,“3”:“-0.008542933”,“4”:“2.865942e-02”,“rn”:“sweden-mexico”},{“1”:“-0.1670345”,“2”:“-0.263054440”,“3”:“-0.071014645”,“4”:“3.212117e-06”,“rn”:“sweden-south_africa”},{“1”:“-0.1427596”,“2”:“-0.235824731”,“3”:“-0.049694417”,“4”:“7.934629e-05”,“rn”:“united_states-south_africa”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}
One-way ANOVA for the effect of the taxonomic group on indicator 2, removing outliers >200 pops and taxonomic groups with too few data
# summary of n per variable
table(ind2_data$taxonomic_group)
##
## amphibian angiosperm bird bryophyte fish
## 63 233 140 5 70
## fungus gymnosperm invertebrate mammal other
## 3 19 148 140 17
## pteridophytes reptile
## 14 72
# subset data
ind2_data_anova<- ind2_data %>%
filter(indicator2<200) %>%
filter(taxonomic_group %!in% c("fungus", "bryophyte", "other", "pteridophytes"))
# summary of n per variable
table(ind2_data_anova$taxonomic_group)
##
## amphibian angiosperm bird fish gymnosperm invertebrate
## 50 141 84 49 9 83
## mammal reptile
## 82 40
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ taxonomic_group, data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ taxonomic_group, data = ind2_data_anova)
##
## Terms:
## taxonomic_group Residuals
## Sum of Squares 3.835361 29.572753
## Deg. of Freedom 7 530
##
## Residual standard error: 0.2362153
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## taxonomic_group 7 3.835 0.5479 9.82 1.55e-11 ***
## Residuals 530 29.573 0.0558
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<200)
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified * country_assessment,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 3.349251 3.683287
## Deg. of Freedom 11 8
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 1.711872 25.670183
## Deg. of Freedom 38 507
##
## Residual standard error: 0.2250145
## 50 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 11 3.349 0.3045 6.014
## country_assessment 8 3.683 0.4604 9.093
## defined_populations_simplified:country_assessment 38 1.712 0.0450 0.890
## Residuals 507 25.670 0.0506
## Pr(>F)
## defined_populations_simplified 2.88e-09 ***
## country_assessment 1.03e-11 ***
## defined_populations_simplified:country_assessment 0.66
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n
# variables with enough n
enough_n<-ind2_data %>%
group_by(country_assessment, defined_populations_simplified) %>%
summarise(n=n()) %>%
filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>%
filter(indicator2<200) %>%
# this gives the country
filter(country_assessment==unique(enough_n$country_assessment)[1] &
#this gives the methods for that country (the last [[1]] is to get the results out of a list)
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]] |
# the same for rest of countries. Notice the use of & for methods within country and | to change to other country
country_assessment==unique(enough_n$country_assessment)[2] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[3] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[4] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[5] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[6] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[7] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[8] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified * country_assessment,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 4.502732 1.409864
## Deg. of Freedom 7 4
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 0.010695 14.329461
## Deg. of Freedom 3 294
##
## Residual standard error: 0.2207706
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 7 4.503 0.6432 13.198
## country_assessment 4 1.410 0.3525 7.232
## defined_populations_simplified:country_assessment 3 0.011 0.0036 0.073
## Residuals 294 14.329 0.0487
## Pr(>F)
## defined_populations_simplified 8.61e-15 ***
## country_assessment 1.45e-05 ***
## defined_populations_simplified:country_assessment 0.974
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p
adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.4959211”,“2”:“-0.7093227”,“3”:“-0.282519566”,“4”:“2.728338e-10”,“rn”:“management_units-adaptive_traits
management_units”},{“1”:“-0.1294251”,“2”:“-0.2552278”,“3”:“-0.003622326”,“4”:“3.872277e-02”,“rn”:“geographic_boundaries-genetic_clusters”},{“1”:“-0.1718010”,“2”:“-0.3295851”,“3”:“-0.014016829”,“4”:“2.207185e-02”,“rn”:“geographic_boundaries
eco_biogeo_proxies-genetic_clusters”},{“1”:“-0.5007798”,“2”:“-0.6849238”,“3”:“-0.316635841”,“4”:“8.852918e-13”,“rn”:“management_units-genetic_clusters”},{“1”:“-0.4770185”,“2”:“-0.6651598”,“3”:“-0.288877091”,“4”:“5.258016e-12”,“rn”:“management_units-genetic_clusters
geographic_boundaries”},{“1”:“-0.3713547”,“2”:“-0.5300385”,“3”:“-0.212670959”,“4”:“2.006239e-10”,“rn”:“management_units-geographic_boundaries”},{“1”:“-0.4698739”,“2”:“-0.6863652”,“3”:“-0.253382619”,“4”:“4.592031e-09”,“rn”:“management_units-geographic_boundaries
adaptive_traits”},{“1”:“-0.3289788”,“2”:“-0.5140465”,“3”:“-0.143911179”,“4”:“3.317563e-06”,“rn”:“management_units-geographic_boundaries
eco_biogeo_proxies”},{“1”:“-0.4673140”,“2”:“-0.6872072”,“3”:“-0.247420839”,“4”:“1.028749e-08”,“rn”:“management_units-low_freq_combinations”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}
One-way ANOVA for the effect of the method to define populations on indicator 2, removing the extreme outlier (>1,000 pops)
# subset data without massive outlier
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 30
## genetic_clusters
## 50
## genetic_clusters eco_biogeo_proxies
## 12
## genetic_clusters geographic_boundaries
## 46
## geographic_boundaries
## 180
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 56
## geographic_boundaries management_units
## 17
## low_freq_combinations
## 69
## management_units
## 45
## other
## 9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 3.349 0.30448 5.42 3.22e-08 ***
## Residuals 553 31.065 0.05618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Same One-way ANOVA for the effect of the method to define populations on indicator 2, but removing outliers >1000 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000)
# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
##
## adaptive_traits management_units
## 19
## eco_biogeo_proxies
## 30
## genetic_clusters
## 50
## genetic_clusters eco_biogeo_proxies
## 12
## genetic_clusters geographic_boundaries
## 46
## geographic_boundaries
## 180
## geographic_boundaries adaptive_traits
## 32
## geographic_boundaries eco_biogeo_proxies
## 56
## geographic_boundaries management_units
## 17
## low_freq_combinations
## 69
## management_units
## 45
## other
## 9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified 11 3.349 0.30448 5.42 3.22e-08 ***
## Residuals 553 31.065 0.05618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the country on indicator 2, removing outliers >1000 pops
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000)
# summary of n per variable
table(ind2_data_anova$country_assessment)
##
## australia belgium colombia france japan
## 28 27 36 34 50
## mexico south_africa sweden united_states
## 28 91 125 146
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ country_assessment, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## country_assessment 8 6.576 0.8220 16.42 <2e-16 ***
## Residuals 556 27.838 0.0501
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the taxonomic group on indicator 2, removing outliers >1000 pops and taxonomic groups with too few data
# summary of n per variable
table(ind2_data$taxonomic_group)
##
## amphibian angiosperm bird bryophyte fish
## 63 233 140 5 70
## fungus gymnosperm invertebrate mammal other
## 3 19 148 140 17
## pteridophytes reptile
## 14 72
# subset data
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000) %>%
filter(taxonomic_group %!in% c("fungus", "bryophyte", "other"))
# summary of n per variable
table(ind2_data_anova$taxonomic_group)
##
## amphibian angiosperm bird fish gymnosperm
## 50 141 84 49 9
## invertebrate mammal pteridophytes reptile
## 83 82 8 40
# One way ANOVA
res.anova.extant<-aov(indicator2 ~ taxonomic_group, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## taxonomic_group 8 3.836 0.4794 8.578 4.96e-11 ***
## Residuals 537 30.014 0.0559
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the global IUCN on indicator 2, removing outliers >1000 pops and taxonomic groups with too few data
# summary of n per variable
table(ind2_data$global_IUCN)
##
## cr dd en lc not_assessed nt
## 63 15 104 280 259 86
## unknown vu
## 4 112
# subset data
ind2_data_anova<- ind2_data %>%
filter(indicator2<1000) %>%
filter(global_IUCN %!in% c("dd", "unknown"))
# summary of n per variable
table(ind2_data_anova$global_IUCN)
##
## cr en lc not_assessed nt vu
## 37 62 157 166 57 75
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ global_IUCN, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## global_IUCN 5 0.26 0.05136 0.848 0.516
## Residuals 548 33.18 0.06055
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
One-way ANOVA for the effect of the species range on indicator 2
# summary of n per variable
table(ind2_data$species_range)
##
## restricted unknown wide_ranging
## 509 26 389
# subset data
ind2_data_anova<- ind2_data %>%
filter(species_range != "unknown")
# summary of n per variable
table(ind2_data_anova$species_range)
##
## restricted wide_ranging
## 509 389
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ species_range, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## species_range 1 0.43 0.4267 7.094 0.00796 **
## Residuals 548 32.96 0.0601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 348 observations deleted due to missingness
One-way ANOVA for the effect of the rarity on indicator 2
# summary of n per variable
table(ind2_data_firstmulti$rarity)
##
## not_rare rare_natural rare_recent
## 371 354 152
# subset data
ind2_data_anova<- ind2_data_firstmulti
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ species_range, data=ind2_data_anova)
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value Pr(>F)
## species_range 2 0.51 0.2547 4.163 0.0161 *
## Residuals 528 32.31 0.0612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 346 observations deleted due to missingness
Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?
# subset data without outliers
ind2_data_anova<- ind2_data %>%
filter(indicator2<200)
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified * country_assessment,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 3.349251 3.683287
## Deg. of Freedom 11 8
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 1.711872 25.670183
## Deg. of Freedom 38 507
##
## Residual standard error: 0.2250145
## 50 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 11 3.349 0.3045 6.014
## country_assessment 8 3.683 0.4604 9.093
## defined_populations_simplified:country_assessment 38 1.712 0.0450 0.890
## Residuals 507 25.670 0.0506
## Pr(>F)
## defined_populations_simplified 2.88e-09 ***
## country_assessment 1.03e-11 ***
## defined_populations_simplified:country_assessment 0.66
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n
# variables with enough n
enough_n<-ind2_data %>%
group_by(country_assessment, defined_populations_simplified) %>%
summarise(n=n()) %>%
filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>%
filter(indicator2<200) %>%
# this gives the country
filter(country_assessment==unique(enough_n$country_assessment)[1] &
#this gives the methods for that country (the last [[1]] is to get the results out of a list)
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]] |
# the same for rest of countries. Notice the use of & for methods within country and | to change to other country
country_assessment==unique(enough_n$country_assessment)[2] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[3] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[4] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[5] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[6] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[7] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] |
country_assessment==unique(enough_n$country_assessment)[8] &
defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])
# summary of n per variable
ind2_data_anova %>%
group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
## aov(formula = indicator2 ~ defined_populations_simplified * country_assessment,
## data = ind2_data_anova)
##
## Terms:
## defined_populations_simplified country_assessment
## Sum of Squares 4.502732 1.409864
## Deg. of Freedom 7 4
## defined_populations_simplified:country_assessment Residuals
## Sum of Squares 0.010695 14.329461
## Deg. of Freedom 3 294
##
## Residual standard error: 0.2207706
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
## Df Sum Sq Mean Sq F value
## defined_populations_simplified 7 4.503 0.6432 13.198
## country_assessment 4 1.410 0.3525 7.232
## defined_populations_simplified:country_assessment 3 0.011 0.0036 0.073
## Residuals 294 14.329 0.0487
## Pr(>F)
## defined_populations_simplified 8.61e-15 ***
## country_assessment 1.45e-05 ***
## defined_populations_simplified:country_assessment 0.974
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):
# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
Indicator 3 refers to the number (count) of taxa by country in which
genetic monitoring is occurring. This is stored in the variable
temp_gen_monitoring as a “yes/no” answer for each taxon, so
to estimate the indicator, we only need to count how many said “yes”,
keeping only one of the records when the taxon was multiassessed:
indicator3<-ind3_data %>%
# keep only one record if the taxon was assessed more than once within the country
select(country_assessment, taxon, temp_gen_monitoring) %>%
filter(!duplicated(.)) %>%
# count "yes" in tem_gen_monitoring by country
filter(temp_gen_monitoring=="yes") %>%
group_by(country_assessment) %>%
summarise(n_taxon_gen_monitoring= n())
Plot indicator 3 by country:
## from the table counting studies by country
indicator3 %>%
ggplot(aes(x=country_assessment, y=n_taxon_gen_monitoring)) +
geom_bar(stat="identity")
## from raw data to include other variables for color by taxonomic group
ind3_data %>%
# keep only one record if the taxon was assessed more than once within the country
select(country_assessment, taxon, temp_gen_monitoring, taxonomic_group) %>%
filter(!duplicated(.)) %>%
# count "yes" in tem_gen_monitoring by country
filter(temp_gen_monitoring=="yes") %>%
ggplot(aes(x=country_assessment, fill=taxonomic_group)) +
geom_bar() +
scale_fill_manual(values= taxoncolors,
breaks=levels(as.factor(ind3_data$taxonomic_group))) +
theme_bw()
## by iucn
ind3_data$global_IUCN<-metadata$global_IUCN
ind3_data %>%
# keep only one record if the taxon was assessed more than once within the country
select(country_assessment, taxon, temp_gen_monitoring, global_IUCN) %>%
filter(!duplicated(.)) %>%
# count "yes" in tem_gen_monitoring by country
filter(temp_gen_monitoring=="yes") %>%
ggplot(aes(x=country_assessment, fill=global_IUCN)) +
geom_bar() +
scale_fill_manual(values= IUCNcolors, # iucn color codes
breaks=levels(as.factor(ind3_data$global_IUCN))) +
theme_bw()
Relatively few taxa have genetic monitoring, but many have some sort of genetic study. Let’s check that, but first subset the ind3_data keeping only taxa assessed a single time, plust the first record of those assessed multiple times.
#subset keeping only taxa assessed a single time, plust the first record of those assessed multiple times.
ind3_data_firstmulti<-ind3_data[!duplicated(cbind(ind3_data$taxon, ind3_data$country_assessment)), ]
Sankey plot of genetic studies
# transform data to how ggsankey wants it
df <- ind3_data_firstmulti %>%
make_long(country_assessment, temp_gen_monitoring, gen_studies)
# plot
ggplot(df, aes(x = x,
next_x = next_x,
node = node,
next_node = next_node,
fill = factor(node),
label = node)) +
geom_sankey(flow.alpha = 0.5,
show.legend = FALSE) +
geom_sankey_label(size = 2.5, color = "black", fill = "white") +
theme_sankey(base_size = 10) +
# manually set flow fill according to desired color
# countries
scale_fill_manual(values=c(scales::hue_pal()(length(unique(ind3_data_firstmulti$country_assessment))),
# traffic light for monitoring
c("darkolivegreen", "brown3", "darkgrey"),
# nice soft colors for gen_studies
c("grey50", "grey35", "grey50", "brown3")),
breaks=c(unique(ind3_data_firstmulti$country_assessment),
unique(ind3_data_firstmulti$temp_gen_monitoring),
unique(ind3_data_firstmulti$gen_studies))) +
xlab("")
Similar, but alluvial to show data colloring the flow by country
# format data as needed
foralluvial_2<-ind3_data_firstmulti %>% group_by(country_assessment, temp_gen_monitoring, gen_studies) %>%
summarise(n=n())
# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_2$country_assessment)))
# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_2$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)
# plot
alluvial(foralluvial_2[,1:3], freq = foralluvial_2$n,
col=countries,
blocks=FALSE,
gap.width = 0.5,
cex=.8,
xw = 0.1,
cw = 0.2,
border = NA)
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.1.1 viridis_0.6.3 viridisLite_0.4.0 alluvial_0.1-2
## [5] ggsankey_0.0.99999 ggplot2_3.4.1 stringr_1.4.0 utile.tools_0.2.7
## [9] dplyr_1.0.9 tidyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 pillar_1.7.0 bslib_0.3.1 compiler_4.2.1
## [5] jquerylib_0.1.4 tools_4.2.1 digest_0.6.29 gtable_0.3.0
## [9] jsonlite_1.8.0 evaluate_0.15 lifecycle_1.0.3 tibble_3.1.7
## [13] pkgconfig_2.0.3 rlang_1.0.6 cli_3.6.0 DBI_1.1.3
## [17] rstudioapi_0.13 yaml_2.3.5 xfun_0.31 fastmap_1.1.0
## [21] gridExtra_2.3 withr_2.5.0 knitr_1.39 generics_0.1.3
## [25] vctrs_0.5.2 sass_0.4.1 grid_4.2.1 tidyselect_1.1.2
## [29] glue_1.6.2 R6_2.5.1 fansi_1.0.3 rmarkdown_2.14
## [33] farver_2.1.1 purrr_0.3.4 magrittr_2.0.3 scales_1.2.0
## [37] ellipsis_0.3.2 htmltools_0.5.5 assertthat_0.2.1 colorspace_2.0-3
## [41] labeling_0.4.2 utf8_1.2.2 stringi_1.7.6 munsell_0.5.0
## [45] crayon_1.5.1